Part 1: Cosine Distance

This document contains analyses of BERT cosine distances on zeugmatic sentences with ambiguous words, and compares those distances to human similarity judgments.

Load data

First, we load the data with summary statistics about each item. We also load the BERT cosine distances.

### Set working directory (comment this out to run)
# setwd("/Users/seantrott/Dropbox/UCSD/Research/Ambiguity/SSD/zeugma_norms/src/analysis")

### Load norming data
# Read the human similarity-norming data (one row per normed item);
# paths are relative to src/analysis.
df_normed = read_csv("../../data/raw/similarity.csv")
## Parsed with column specification:
## cols(
##   CW = col_character(),
##   String = col_character(),
##   `Dominance Score` = col_character(),
##   `Alternative Dominance Score from Armstrong et al. (2012) EDOM` = col_character(),
##   `Similarity Norming Category` = col_character(),
##   `Zeugmatic Similarity Norming Sentence` = col_character(),
##   Anaphora = col_character(),
##   `Similarity Mean` = col_double(),
##   `Similarity STDEV` = col_double(),
##   `Similarity SEM` = col_double()
## )
nrow(df_normed)
## [1] 320
# Lowercase the critical word (CW) so it can serve as a join key against
# the BERT data's `word` column below.
df_normed = df_normed %>%
  mutate(word = tolower(CW))

### Load BERT distances
# Read per-item BERT cosine distances (one column per layer of BERT-large)
# and drop the unnamed pandas-style index column X1.
df_bert = read_csv("../../data/processed/distances.csv") %>%
  select(-X1)
## Warning: Missing column names filled in: 'X1' [1]
## Parsed with column specification:
## cols(
##   X1 = col_double(),
##   `Similarity Norming Category` = col_character(),
##   distance_bert_large_hf_layer_1 = col_double(),
##   distance_bert_large_hf_layer_10 = col_double(),
##   distance_bert_large_hf_layer_11 = col_double(),
##   distance_bert_large_hf_layer_12 = col_double(),
##   distance_bert_large_hf_layer_2 = col_double(),
##   distance_bert_large_hf_layer_3 = col_double(),
##   distance_bert_large_hf_layer_4 = col_double(),
##   distance_bert_large_hf_layer_5 = col_double(),
##   distance_bert_large_hf_layer_6 = col_double(),
##   distance_bert_large_hf_layer_7 = col_double(),
##   distance_bert_large_hf_layer_8 = col_double(),
##   distance_bert_large_hf_layer_9 = col_double(),
##   string = col_character(),
##   word = col_character()
## )
nrow(df_bert)
## [1] 314

We then merge them together:

# Join on word + norming category. The inner join silently drops the 6
# normed items (320 -> 314) that have no BERT distance row.
df_merged = df_normed %>%
  inner_join(df_bert, by = c('word', 'Similarity Norming Category')) %>%
  mutate(ambiguity_type = `Similarity Norming Category`,  # shorter aliases
         sim = `Similarity Mean`)                          # for modeling below
nrow(df_merged)
## [1] 314

Does Ambiguity Type predict cosine distance?

We know that the similarity scores reflect the underlying Ambiguity Type.

# Boxplot: distribution of human similarity judgments by ambiguity type.
ggplot(df_merged,
       aes(x = ambiguity_type, y = sim, fill = ambiguity_type)) +
  geom_boxplot() +
  theme_minimal() +
  labs(x = "Ambiguity Type",
       y = "Similarity Judgment",
       fill = "Ambiguity Type")

# Ridge densities of similarity judgments, one ridge per ambiguity type.
ggplot(df_merged,
       aes(x = sim, y = ambiguity_type, fill = ambiguity_type)) +
  geom_density_ridges2(aes(height = ..density..),
                       stat = "density",
                       color = gray(0.25),
                       alpha = 0.5,
                       scale = 0.85,
                       size = 0.9) +
  theme_minimal() +
  labs(x = "Similarity Judgment",
       y = "Ambiguity type")

However, this effect seems considerably weaker for the cosine distance measures:

# Boxplot: final-layer (12) cosine distance by ambiguity type.
ggplot(df_merged,
       aes(x = ambiguity_type,
           y = distance_bert_large_hf_layer_12,
           fill = ambiguity_type)) +
  geom_boxplot() +
  theme_minimal() +
  labs(x = "Ambiguity Type",
       y = "Cosine Distance (Final Layer)",
       fill = "Ambiguity Type")

# Ridge densities of final-layer cosine distance by ambiguity type.
ggplot(df_merged,
       aes(x = distance_bert_large_hf_layer_12,
           y = ambiguity_type,
           fill = ambiguity_type)) +
  geom_density_ridges2(aes(height = ..density..),
                       stat = "density",
                       color = gray(0.25),
                       alpha = 0.5,
                       scale = 0.85,
                       size = 0.9) +
  theme_minimal() +
  labs(x = "Cosine Distance (Final Layer)",
       y = "Ambiguity type",
       fill = "Ambiguity Type")

We find that a model predicting Distance (Final layer) with Ambiguity Type, and a random intercept for Anaphora, explains more variance than a model with only the random intercept.

# Mixed models fit by ML (REML = FALSE) so the likelihood-ratio comparison
# below is valid: the full model adds ambiguity type over a random
# intercept for Anaphora.
model_full = lmer(
  distance_bert_large_hf_layer_12 ~ ambiguity_type + (1 | Anaphora),
  data = df_merged,
  REML = FALSE
)

model_reduced = lmer(
  distance_bert_large_hf_layer_12 ~ (1 | Anaphora),
  data = df_merged,
  REML = FALSE
)

# Fixed effects: all three non-homonym categories (IP/RP/UA) show lower
# layer-12 distances than the reference level (H = homonyms).
summary(model_full)
## Linear mixed model fit by maximum likelihood . t-tests use Satterthwaite's
##   method [lmerModLmerTest]
## Formula: distance_bert_large_hf_layer_12 ~ ambiguity_type + (1 | Anaphora)
##    Data: df_merged
## 
##      AIC      BIC   logLik deviance df.resid 
##   -589.7   -567.2    300.9   -601.7      308 
## 
## Scaled residuals: 
##     Min      1Q  Median      3Q     Max 
## -3.2789 -0.6327 -0.0599  0.6889  3.1911 
## 
## Random effects:
##  Groups   Name        Variance Std.Dev.
##  Anaphora (Intercept) 0.035720 0.18900 
##  Residual             0.006571 0.08106 
## Number of obs: 314, groups:  Anaphora, 30
## 
## Fixed effects:
##                   Estimate Std. Error        df t value Pr(>|t|)    
## (Intercept)        0.43830    0.03757  31.44282  11.668 5.85e-13 ***
## ambiguity_typeIP  -0.02024    0.01340 286.37763  -1.510   0.1321    
## ambiguity_typeRP  -0.03551    0.01467 288.04514  -2.421   0.0161 *  
## ambiguity_typeUA  -0.03454    0.01448 286.50318  -2.385   0.0177 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Correlation of Fixed Effects:
##             (Intr) amb_IP amb_RP
## ambgty_tyIP -0.204              
## ambgty_tyRP -0.225  0.454       
## ambgty_tyUA -0.199  0.443  0.530
# Likelihood-ratio test of ambiguity type; only marginal here (p ~= .058).
anova(model_full, model_reduced)
## Data: df_merged
## Models:
## model_reduced: distance_bert_large_hf_layer_12 ~ (1 | Anaphora)
## model_full: distance_bert_large_hf_layer_12 ~ ambiguity_type + (1 | Anaphora)
##               npar     AIC     BIC logLik deviance  Chisq Df Pr(>Chisq)  
## model_reduced    3 -588.27 -577.02 297.14  -594.27                       
## model_full       6 -589.74 -567.24 300.87  -601.74 7.4658  3    0.05844 .
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Descriptive cell means: mean/SD of layer-12 cosine distance per type.
df_merged %>%
  group_by(ambiguity_type) %>%
  summarise(mean_distance = mean(distance_bert_large_hf_layer_12),
            sd_distance = sd(distance_bert_large_hf_layer_12))
## `summarise()` ungrouping output (override with `.groups` argument)
## # A tibble: 4 x 3
##   ambiguity_type mean_distance sd_distance
##   <chr>                  <dbl>       <dbl>
## 1 H                      0.515      0.0951
## 2 IP                     0.497      0.123 
## 3 RP                     0.464      0.121 
## 4 UA                     0.455      0.105

Does cosine distance predict similarity?

Here, we correlate cosine distance with similarity judgments, and analyze this across all layers of BERT.

# Correlate each layer's cosine distance with the human similarity ratings.
# Collect the per-layer tidy results in a preallocated list and bind them
# once at the end, rather than growing a data frame with rbind() inside
# the loop (which copies the accumulator on every iteration).
layer_results = vector("list", 12)
for (layer in seq_len(12)) {

  col_name = paste("distance_bert_large_hf_layer", layer, sep = "_")

  # Pearson correlation between this layer's distance and similarity.
  r = cor.test(df_merged[[col_name]], df_merged$sim)

  df_r = broom::tidy(r)
  df_r$layer = layer

  layer_results[[layer]] = df_r

}
df_all_layers = bind_rows(layer_results)

# Correlation (with 95% CI) between distance and similarity, by layer.
# Fix: the error-bar bounds were swapped (conf.high was mapped to ymin and
# conf.low to ymax); they now map to the correct limits. The position_dodge
# was also dropped — with no grouping aesthetic it had no effect.
df_all_layers %>%
  ggplot(aes(x = layer,
             y = estimate)) +
  geom_line() +
  geom_errorbar(aes(ymin = conf.low, 
                    ymax = conf.high), 
                width = .2) +
  labs(x = "Layer",
       y = "Correlation between distance and similarity") +
  theme_minimal() +
  theme(axis.title = element_text(size=rel(2)),
        axis.text = element_text(size = rel(2)),
        legend.text = element_text(size = rel(2)),
        legend.title = element_text(size = rel(2)))

# Save the most recently drawn plot (default device size, 7 x 5 in).
ggsave("../../Figures/r_layers.png", dpi = 300)
## Saving 7 x 5 in image
# Which layer shows the strongest (most negative) correlation?
# slice_min() avoids filtering on floating-point equality and the awkward
# df_all_layers$ reference inside the frame's own pipe. Ties, if any, are
# all returned — the same behavior as the equality filter it replaces.
df_all_layers %>%
  slice_min(estimate, n = 1)
## # A tibble: 1 x 9
##   estimate statistic p.value parameter conf.low conf.high method alternative
##      <dbl>     <dbl>   <dbl>     <int>    <dbl>     <dbl> <chr>  <chr>      
## 1   -0.231     -4.20 3.44e-5       312   -0.334    -0.124 Pears… two.sided  
## # … with 1 more variable: layer <int>
## Now view layer
# Scatter: final-layer cosine distance vs. similarity judgment, colored
# and shaped by ambiguity type.
ggplot(df_merged,
       aes(x = distance_bert_large_hf_layer_12,
           y = sim,
           color = ambiguity_type,
           shape = ambiguity_type)) +
  geom_point(alpha = .6, size = 2) +
  theme_minimal() +
  labs(x = "Cosine Distance",
       y = "Similarity Judgment",
       color = "Ambiguity Type",
       shape = "Ambiguity Type")

We also asked whether Distance (Layer 12) improves a model above and beyond ambiguity type.

# Does layer-12 distance predict similarity over and above ambiguity type?
# Both models fit by ML (REML = FALSE) for the LRT below.
model_full = lmer(
  sim ~ ambiguity_type + distance_bert_large_hf_layer_12 + (1 | Anaphora),
  data = df_merged,
  REML = FALSE
)

model_reduced = lmer(
  sim ~ ambiguity_type + (1 | Anaphora),
  data = df_merged,
  REML = FALSE
)

# Distance does not reach significance once ambiguity type is included
# (see the distance coefficient, p = .109).
summary(model_full)
## Linear mixed model fit by maximum likelihood . t-tests use Satterthwaite's
##   method [lmerModLmerTest]
## Formula: sim ~ ambiguity_type + distance_bert_large_hf_layer_12 + (1 |  
##     Anaphora)
##    Data: df_merged
## 
##      AIC      BIC   logLik deviance df.resid 
##    819.9    846.2   -403.0    805.9      307 
## 
## Scaled residuals: 
##     Min      1Q  Median      3Q     Max 
## -3.7912 -0.4819  0.0042  0.4082  3.9082 
## 
## Random effects:
##  Groups   Name        Variance Std.Dev.
##  Anaphora (Intercept) 0.01073  0.1036  
##  Residual             0.75476  0.8688  
## Number of obs: 314, groups:  Anaphora, 30
## 
## Fixed effects:
##                                 Estimate Std. Error       df t value Pr(>|t|)
## (Intercept)                       2.0842     0.2547 233.6958   8.182 1.79e-14
## ambiguity_typeIP                  0.9214     0.1389 312.9471   6.634 1.44e-10
## ambiguity_typeRP                  3.2300     0.1466 237.5380  22.035  < 2e-16
## ambiguity_typeUA                  4.5779     0.1457 271.2706  31.431  < 2e-16
## distance_bert_large_hf_layer_12  -0.7241     0.4500 288.9129  -1.609    0.109
##                                    
## (Intercept)                     ***
## ambiguity_typeIP                ***
## ambiguity_typeRP                ***
## ambiguity_typeUA                ***
## distance_bert_large_hf_layer_12    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Correlation of Fixed Effects:
##             (Intr) amb_IP amb_RP amb_UA
## ambgty_tyIP -0.338                     
## ambgty_tyRP -0.425  0.493              
## ambgty_tyUA -0.443  0.496  0.538       
## dstn_____12 -0.893  0.059  0.131  0.155
# LRT: adding distance does not significantly improve fit (p = .109).
anova(model_full, model_reduced)
## Data: df_merged
## Models:
## model_reduced: sim ~ ambiguity_type + (1 | Anaphora)
## model_full: sim ~ ambiguity_type + distance_bert_large_hf_layer_12 + (1 | 
## model_full:     Anaphora)
##               npar    AIC    BIC  logLik deviance  Chisq Df Pr(>Chisq)
## model_reduced    6 820.49 842.99 -404.25   808.49                     
## model_full       7 819.93 846.18 -402.97   805.93 2.5644  1     0.1093

Part 2: Surprisal on anaphoric word

Here, we used a masked language modeling task to measure the probability of observing the anaphoric word in that position. This is meant to approximate measures of a subject's experience of encountering that anaphoric word, such as RT or the N400 effect.

Load data

# Read masked-LM probabilities for the anaphoric word, one row per item.
df_surprisal = read_csv("../../data/processed/surprisals.csv")
## Warning: Missing column names filled in: 'X1' [1]
## Parsed with column specification:
## cols(
##   X1 = col_double(),
##   `Similarity Norming Category` = col_character(),
##   probability = col_double(),
##   string = col_character(),
##   word = col_character()
## )
nrow(df_surprisal)
## [1] 314
# Merge surprisal into the analysis frame. NOTE(review): unlike df_bert,
# the X1 index column is not dropped here, and both frames carry `string`,
# so the join leaves suffixed duplicates (string.x/string.y) in df_merged —
# harmless for the analyses below, but worth tidying.
df_merged = df_merged %>%
  inner_join(df_surprisal, by = c('word', 'Similarity Norming Category')) %>%
  mutate(surprisal = -log(probability))  # natural-log surprisal (nats)

nrow(df_merged)
## [1] 314

Does Ambiguity Type predict surprisal?

# Ridge densities of masked-word surprisal by ambiguity type, with
# enlarged axis/legend text for the saved figure.
ggplot(df_merged,
       aes(x = surprisal, y = ambiguity_type, fill = ambiguity_type)) +
  geom_density_ridges2(aes(height = ..density..),
                       stat = "density",
                       color = gray(0.25),
                       alpha = 0.5,
                       scale = 0.85,
                       size = 0.9) +
  labs(x = "Surprisal of Masked Word",
       y = "Ambiguity type",
       fill = "Ambiguity Type") +
  theme_minimal() +
  theme(axis.title = element_text(size = rel(2)),
        axis.text = element_text(size = rel(2)),
        legend.text = element_text(size = rel(2)),
        legend.title = element_text(size = rel(2)))

# Save the most recently drawn plot (default 7 x 5 in).
ggsave("../../Figures/surprisal_condition.png", dpi = 300)
## Saving 7 x 5 in image
# Does ambiguity type predict surprisal? ML fits for the LRT below.
model_full = lmer(
  surprisal ~ ambiguity_type + (1 | Anaphora),
  data = df_merged,
  REML = FALSE
)

model_reduced = lmer(
  surprisal ~ (1 | Anaphora),
  data = df_merged,
  REML = FALSE
)

# Fixed effects: RP items have reliably lower surprisal than homonyms (H);
# IP and UA do not differ reliably from H.
summary(model_full)
## Linear mixed model fit by maximum likelihood . t-tests use Satterthwaite's
##   method [lmerModLmerTest]
## Formula: surprisal ~ ambiguity_type + (1 | Anaphora)
##    Data: df_merged
## 
##      AIC      BIC   logLik deviance df.resid 
##   1170.5   1193.0   -579.3   1158.5      308 
## 
## Scaled residuals: 
##     Min      1Q  Median      3Q     Max 
## -3.5055 -0.5250  0.0620  0.6646  2.5062 
## 
## Random effects:
##  Groups   Name        Variance Std.Dev.
##  Anaphora (Intercept) 5.829    2.414   
##  Residual             1.874    1.369   
## Number of obs: 314, groups:  Anaphora, 30
## 
## Fixed effects:
##                   Estimate Std. Error        df t value Pr(>|t|)    
## (Intercept)        6.48397    0.50587  35.98690  12.817 5.59e-15 ***
## ambiguity_typeIP  -0.08125    0.22576 290.58920  -0.360  0.71918    
## ambiguity_typeRP  -0.68832    0.24683 292.90675  -2.789  0.00564 ** 
## ambiguity_typeUA  -0.40211    0.24386 291.16813  -1.649  0.10024    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Correlation of Fixed Effects:
##             (Intr) amb_IP amb_RP
## ambgty_tyIP -0.255              
## ambgty_tyRP -0.282  0.457       
## ambgty_tyUA -0.251  0.445  0.530
# LRT: ambiguity type significantly improves fit over the intercept-only
# model (p = .03).
anova(model_full, model_reduced)
## Data: df_merged
## Models:
## model_reduced: surprisal ~ (1 | Anaphora)
## model_full: surprisal ~ ambiguity_type + (1 | Anaphora)
##               npar    AIC    BIC  logLik deviance  Chisq Df Pr(>Chisq)  
## model_reduced    3 1173.5 1184.7 -583.74   1167.5                       
## model_full       6 1170.5 1193.0 -579.27   1158.5 8.9396  3     0.0301 *
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Descriptive cell means: mean/SD of surprisal per ambiguity type.
df_merged %>%
  group_by(ambiguity_type) %>%
  summarise(mean_surprisal = mean(surprisal),
            sd_surprisal = sd(surprisal))
## `summarise()` ungrouping output (override with `.groups` argument)
## # A tibble: 4 x 3
##   ambiguity_type mean_surprisal sd_surprisal
##   <chr>                   <dbl>        <dbl>
## 1 H                        6.42         1.73
## 2 IP                       6.56         2.17
## 3 RP                       5.06         2.17
## 4 UA                       5.19         1.93

Does surprisal predict similarity ratings beyond Ambiguity Type?

We also asked whether the surprisal of the anaphoric word was predictive of similarity above and beyond the ambiguity type category, and found that it was.

# Does surprisal predict similarity beyond ambiguity type? ML fits for
# the LRT below.
model_full = lmer(
  sim ~ surprisal + ambiguity_type + (1 | Anaphora),
  data = df_merged,
  REML = FALSE
)

model_reduced = lmer(
  sim ~ ambiguity_type + (1 | Anaphora),
  data = df_merged,
  REML = FALSE
)

# Surprisal is a reliable negative predictor of similarity even with
# ambiguity type in the model (p < .001).
summary(model_full)
## Linear mixed model fit by maximum likelihood . t-tests use Satterthwaite's
##   method [lmerModLmerTest]
## Formula: sim ~ surprisal + ambiguity_type + (1 | Anaphora)
##    Data: df_merged
## 
##      AIC      BIC   logLik deviance df.resid 
##    809.7    835.9   -397.8    795.7      307 
## 
## Scaled residuals: 
##     Min      1Q  Median      3Q     Max 
## -3.9800 -0.5274 -0.0347  0.4479  3.6869 
## 
## Random effects:
##  Groups   Name        Variance Std.Dev.
##  Anaphora (Intercept) 0.008238 0.09077 
##  Residual             0.731757 0.85543 
## Number of obs: 314, groups:  Anaphora, 30
## 
## Fixed effects:
##                   Estimate Std. Error        df t value Pr(>|t|)    
## (Intercept)        2.28351    0.19353  51.75943  11.799 2.72e-16 ***
## surprisal         -0.09208    0.02537  84.17449  -3.629 0.000487 ***
## ambiguity_typeIP   0.94693    0.13649 311.60784   6.938 2.31e-11 ***
## ambiguity_typeRP   3.17344    0.14487 204.79796  21.905  < 2e-16 ***
## ambiguity_typeUA   4.53230    0.14325 239.82428  31.638  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Correlation of Fixed Effects:
##             (Intr) srprsl amb_IP amb_RP
## surprisal   -0.820                     
## ambgty_tyIP -0.349 -0.023              
## ambgty_tyRP -0.541  0.184  0.479       
## ambgty_tyUA -0.530  0.172  0.483  0.541
# LRT: adding surprisal significantly improves fit (p < .001).
anova(model_full, model_reduced)
## Data: df_merged
## Models:
## model_reduced: sim ~ ambiguity_type + (1 | Anaphora)
## model_full: sim ~ surprisal + ambiguity_type + (1 | Anaphora)
##               npar    AIC    BIC  logLik deviance  Chisq Df Pr(>Chisq)    
## model_reduced    6 820.49 842.99 -404.25   808.49                         
## model_full       7 809.67 835.92 -397.84   795.67 12.822  1  0.0003427 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Scatter: masked-word surprisal vs. similarity judgment, colored and
# shaped by ambiguity type.
ggplot(df_merged,
       aes(x = surprisal,
           y = sim,
           color = ambiguity_type,
           shape = ambiguity_type)) +
  geom_point(alpha = .6, size = 2) +
  theme_minimal() +
  labs(x = "Surprisal of Masked Word",
       y = "Similarity Judgment",
       color = "Ambiguity Type",
       shape = "Ambiguity Type")

# Zero-order Pearson correlation between surprisal and similarity
# (r ~= -.35, p < .001).
cor.test(df_merged$surprisal, df_merged$sim)
## 
##  Pearson's product-moment correlation
## 
## data:  df_merged$surprisal and df_merged$sim
## t = -6.6958, df = 312, p-value = 9.982e-11
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  -0.4475856 -0.2537325
## sample estimates:
##        cor 
## -0.3544618